---
title: "Replicating 'Lost in Space'"
author: "Sophie J. Lee, Howard Liu, and Michael D. Ward"
date: "03/11/2018"
output:
  pdf_document: default
  html_document:
    df_print: paged
---

## Overview 

This RMD file contains the code necessary to generate the tables and figures found in ``Lost in Space: Geolocation in Event Data,'' to appear in Political Science Research and Methods, 2018. It is distributed with the necessary data and files structured so that it can be run independently on a variety of platforms.  

The maps are not replicated herein. A separate file, plotly_static.R, generates them as png images.  

The accompanying files may be found in the compressed archive; they are

* READ_ME.txt
* china_cleantext.csv
* china_province.csv
* china_results.all.csv
* colombia_cleantext.csv
* colombia_province.xlsx
* colombia_results.all.csv
* drc_cleantext.csv
* drc_province.csv
* drc_results.all.csv
* LIS-PSRM-REPL_pdf.Rmd (this file)
* results_china.csv
* results_colombia.csv
* results_drc.csv
* results_syria.csv
* sample100.csv
* syria_cleantext.csv
* syria_province.csv
* syria_results.all.csv
* tab6MS.png (PRE-EXISTING COPY OF ORIGINAL TABLE IN MANUSCRIPT)
* tab8MS.png (PRE-EXISTING COPY OF ORIGINAL TABLE IN MANUSCRIPT)
* fig3MS.png (PRE-EXISTING COPY OF ORIGINAL FIGURE IN MANUSCRIPT)
* plotly_static.R (file generating Fig 4 to be run in RStudio)



Tables 1-5 are illustrative and are not replicated. The same is true for Figure 1.
\newpage
\clearpage




```{r setup, include=FALSE, echo=FALSE, message=FALSE}
library("knitr")

# Verify that every input file this document reads is present in the working
# directory. This chunk is rendered with include=FALSE, so a printed message
# would never reach the reader; abort the knit instead and name what is missing.
required_files <- c(
  "china_province.csv", "china_cleantext.csv",
  "results_china.csv", "china_results.all.csv",
  "drc_province.csv", "drc_cleantext.csv",
  "results_drc.csv", "drc_results.all.csv",
  "syria_province.csv", "syria_cleantext.csv",
  "results_syria.csv", "syria_results.all.csv",
  "colombia_province.xlsx", "colombia_cleantext.csv",
  "results_colombia.csv", "colombia_results.all.csv",
  # Also read further down: the Table 6 sample and the manuscript images.
  "sample100.csv", "tab6MS.png", "tab8MS.png", "fig3MS.png"
)
missing_files <- required_files[!file.exists(required_files)]
if (length(missing_files) > 0) {
  stop(
    "ERROR: The working directory must contain both dictionary and data files. ",
    "Missing: ", paste(missing_files, collapse = ", "),
    call. = FALSE
  )
} else {
  print('Please proceed.')
}

#Helper functions
# Attach each package in `names`, installing it from CRAN first when it is not
# available.
#   names: character vector of package names.
# Returns NULL invisibly; called for its side effect of attaching packages.
package <- function(names) {
  for (name in names) {
    # requireNamespace() asks about the package itself. The previous test,
    # `name %in% installed.packages()`, compared the name against every cell of
    # the installed-packages matrix (library paths, versions, dependency
    # strings, ...), not just the package-name column.
    if (!requireNamespace(name, quietly = TRUE)) {
      install.packages(name, repos = "https://cloud.r-project.org")
    }
    library(name, character.only = TRUE)
  }
  invisible(NULL)
}

# Build the long (text, location) table used as classification target.
#
# For every text, each entry of `namespace` that is detected in the text
# (entries are used as stringr regular expressions) produces one row. Y is 1
# when the trimmed name is also detected in the lower-cased `solution` of that
# text and 0 otherwise. A text in which no name is detected produces a single
# row with NA location and NA Y, and its position is reported in the printed
# `no_location` vector.
#
#   texts:     character vector of documents.
#   namespace: character vector of location names / patterns.
#   solution:  vector parallel to `texts` holding the coded location(s).
#   text_no:   optional ids parallel to `texts`; the position i is used when
#              it is not supplied.
#
# Returns a data.frame with columns text_id, location_names and Y. The columns
# go through cbind(), so they share one common type (character as soon as any
# location name was found).
buildY <- function(texts, namespace, solution, text_no) {
  l <- t <- y <- no_location <- NULL
  # seq_along() rather than 1:n, so empty input yields an empty table instead
  # of looping over the indices 1 and 0.
  for (i in seq_along(texts)) {
    found <- which(stringr::str_detect(texts[i], namespace))
    if (length(found) > 0) {
      locations <- namespace[found]
      hu <- tolower(solution[i])
      # A missing solution is replaced by an upper-case sentinel that the
      # lower-cased location names are not expected to match.
      if (length(hu) == 0 || is.na(hu) || hu == "NA") {
        hu <- "EWW"
      }
      # 1 where the location occurs in the solution, 0 where it does not.
      short <- as.numeric(
        stringr::str_detect(hu, stringr::str_trim(locations, "both"))
      )
    } else {
      no_location <- c(no_location, i)
      locations <- NA
      short <- NA
    }
    index <- if (missing(text_no)) i else text_no[i]
    l <- c(l, locations)
    t <- c(t, rep(index, length(locations)))
    y <- c(y, short)
  }
  binary_data <- as.data.frame(cbind("text_id" = t, "location_names" = l, "Y" = y))
  print(no_location)
  print('Text numbers of those without the location names. Ignore this message if 1) no numbers are printed, 2) these texts do not include any location words, or 3) they do not have event locations.')
  return(binary_data)
}

# Compute the points of a ROC or precision-recall curve with ROCR.
#
#   pred: predicted scores (or an expression evaluated inside `data`).
#   obs:  observed labels (or an expression evaluated inside `data`).
#   data: optional data.frame / environment in which `pred` and `obs` are
#         evaluated.
#   type: "roc" or "pr". NULL (the default) now means "roc"; it used to fail
#         inside switch().
#
# Returns a two-column data.frame in (x, y) plotting order:
#   type = "roc": columns fpr, tpr
#   type = "pr":  columns rec, prec (the first precision value is set to 0)
rocdf <- function(pred, obs, data = NULL, type = NULL) {
  type <- match.arg(type, c("roc", "pr"))
  if (!is.null(data)) {
    pred <- eval(substitute(pred), envir = data)
    obs <- eval(substitute(obs), envir = data)
  }
  # performance(prediction, measure, x.measure): the first name is the y-axis
  # measure and the second the x-axis measure.
  measures <- switch(type, roc = c("tpr", "fpr"), pr = c("prec", "rec"))
  rocr_pr <- ROCR::performance(ROCR::prediction(pred, obs), measures[1], measures[2])
  xy <- data.frame(rocr_pr@x.values[[1]], rocr_pr@y.values[[1]])
  if (type == "pr") {
    xy[1, 2] <- 0
  }
  # Column 1 holds the x values and column 2 the y values, so the ROC columns
  # are (fpr, tpr); they were previously labelled the other way round.
  colnames(xy) <- switch(type, roc = c("fpr", "tpr"), pr = c("rec", "prec"))
  xy
}

# Replace occurrences of the strings in A by a replacement built from the
# corresponding strings in B, inside the texts C.
#   A: patterns to look for (coerced to lower-case character).
#   B: replacement stems, parallel to A (coerced to lower-case character).
#   C: texts to rewrite (coerced to lower-case character).
#   exact.matching: when TRUE each pattern is wrapped in \b word boundaries.
# Returns C as a character vector after all replacements were applied in order.
# str_replace_all() is called unqualified, so stringr must already be attached.
AtoBinC<-function(A, B, C, exact.matching=T){ 
  A<-tolower(as.character(A))
  B<-tolower(as.character(B))
  C<-tolower(as.character(C))
    # NOTE(review): 1:length(A) iterates over 1 and 0 when A is empty.
    for ( i in 1:length(A)){
      if(exact.matching==T){      pattern<-paste0("\\b",tolower(A[i]),"\\b") }else{
        pattern<-tolower(A[i])
      }
      # The replacement is B[i] with one backslash character on each side.
      # NOTE(review): how stringr treats those backslashes inside a replacement
      # string cannot be told from this file - confirm the intended output.
      to<-paste0("\\",B[i], "\\")
  C<- str_replace_all(C, pattern, to)
  }
  return(as.character(C))
}


# Frequency table of the bigrams that surround INCORRECT location names.
#
# For every text and every namespace entry detected in it: all other namespace
# entries are replaced by "OTHERLOCZ", the text is split on ".", and bigrams
# are built per sentence. Detected names that do not occur in the lower-cased
# solution are "wrong"; bigrams containing a wrong name are kept with the name
# replaced by "LOCZ".
#
#   texts, namespace, solution: as in buildY().
#   min, max: range of the foreach index m (defaults 2 and 7).
#   cores:    number of worker processes (default 6, the former fixed value).
#
# Returns a list with one entry per m in min:max; each entry is a one-element
# list holding a data.frame (phrases, Freq) sorted by decreasing Freq.
# Side effect kept for backward compatibility: the result is also stored in
# `Ngrams_incorrect` through `<<-`.
#
# NOTE(review): ngram_asweka() is always called with min = 2, max = 2, so m
# only selects the list slot; every m yields the same bigram table.
incNgrams <- function(texts, namespace, solution, min, max, cores = 6) {
  if (missing(min)) {
    min <- 2
  }
  if (missing(max)) {
    max <- 7
  }
  library(doParallel)
  cluster <- makeCluster(cores)
  # Release the workers even when a task fails part-way through.
  on.exit(stopCluster(cluster), add = TRUE)
  registerDoParallel(cluster)
  Ngrams_incorrect <<- foreach(m = min:max) %dopar% {
    library(stringr)
    Ngrams_incorrect <- list(NULL)
    n <- length(texts)
    phrases <- NULL
    for (i in seq_len(n)) {
      TF <- str_detect(texts[i], namespace)
      if (length(which(TF == TRUE)) > 0) {
        locations <- namespace[which(TF == TRUE)]
        hu <- tolower(solution[i])
        if (is.na(hu)) {
          hu <- "EWW"
        }
        wrong <- NULL
        for (c in 1:length(locations)) {
          # Mask every other namespace entry, then split into sentences.
          prep <- texts[i]
          namespace.2 <- namespace[-which(namespace == locations[c])]
          for (t in namespace.2) {
            prep <- str_replace_all(prep, t, "OTHERLOCZ")
          }
          prep <- str_split(prep, "\\.")[[1]]

          test <- NULL
          for (nn in 1:length(prep)) {
            prep_tmp <- trimws(prep[nn], "l") # remove leading white space
            if (nchar(prep_tmp) > 0) {
              test_tmp <- ngram::ngram_asweka(prep_tmp, min = 2, max = 2)
              test <- c(test, test_tmp)
            }
          }
          if (!str_detect(hu, str_trim(locations[c], "both"))) {
            wrong <- c(wrong, str_trim(locations[c], "both"))
          }
          # `wrong` accumulates over the locations of this text, and all names
          # collected so far are processed again on each pass (as before).
          j <- length(wrong)
          if (j > 0) {
            for (a in 1:j) {
              temp <- test[which(!is.na(str_match(test, wrong[a])))]
              temp <- str_replace_all(temp, wrong[a], "LOCZ")
              temp <- str_replace_all(temp, "LOCZ(.*?)\\b", "LOCZ")
              phrases <- c(phrases, temp)
            }
          } else {
            phrases <- c(phrases, NA)
          }
        }
      } else {
        phrases <- c(phrases, NA)
      }
    }
    matrix <- as.data.frame(table(phrases))
    Ngrams_incorrect[m] <- list(matrix[order(matrix$Freq, decreasing = TRUE), ])
    return(Ngrams_incorrect[m])
  }
  return(Ngrams_incorrect)
}

# Frequency table of the bigrams that surround CORRECT location names.
#
# For every text, the namespace entries detected in it that also occur in the
# lower-cased solution are "correct". The text is split on "." and bigrams are
# built per sentence; each correct name is replaced by "LOCZ" (or "sub_LOCZ"
# after a "sub_" prefix), the other detected names by "OTHERLOCZ", and the
# bigrams matching the LOCZ pattern are kept.
#
#   texts, namespace, solution: as in buildY().
#   min, max: range of the foreach index m (defaults 2 and 7).
#   cores:    number of worker processes (default 6, the former fixed value).
#
# Returns a list with one entry per m in min:max; each entry is a one-element
# list holding a data.frame (phrases, Freq) sorted by decreasing Freq.
# Side effect kept for backward compatibility: the result is also stored in
# `Ngrams_correct` through `<<-`.
#
# NOTE(review): ngram_asweka() is always called with min = 2, max = 2, so m
# only selects the list slot; every m yields the same bigram table.
corNgrams <- function(texts, namespace, solution, min, max, cores = 6) {
  if (missing(min)) {
    min <- 2
  }
  if (missing(max)) {
    max <- 7
  }
  library(doParallel)
  cluster <- makeCluster(cores)
  # Release the workers even when a task fails part-way through.
  on.exit(stopCluster(cluster), add = TRUE)
  registerDoParallel(cluster)
  Ngrams_correct <<- foreach(m = min:max) %dopar% {
    library(stringr)
    Ngrams_correct <- list(NULL)
    n <- length(texts)
    phrases <- NULL
    for (i in seq_len(n)) {
      TF <- str_detect(texts[i], namespace)
      if (length(which(TF == TRUE)) > 0) {
        locations <- namespace[which(TF == TRUE)]
        hu <- tolower(solution[i])
        if (is.na(hu)) {
          hu <- "EWW"
        }
        correct <- NULL
        for (c in 1:length(locations)) {
          if (str_detect(hu, str_trim(locations[c], "both"))) {
            correct <- c(correct, str_trim(locations[c], "both"))
          }
        }
        prep <- str_split(texts[i], "\\.")[[1]]

        test <- NULL
        for (nn in 1:length(prep)) {
          prep_tmp <- trimws(prep[nn], "l") # remove leading white space
          if (nchar(prep_tmp) > 0) {
            test_tmp <- ngram::ngram_asweka(prep_tmp, min = 2, max = 2)
            test <- c(test, test_tmp)
          }
        }
        j <- length(correct)
        if (j > 0) {
          for (a in 1:j) {
            temp <- str_replace_all(test, paste0("\\b", correct[a], "\\b"), "LOCZ")
            temp <- str_replace_all(temp, paste0("\\bsub_", correct[a], "\\b"), "sub_LOCZ")
            namespace.2 <- locations[locations != correct[a]]
            for (t in namespace.2) {
              temp <- str_replace_all(temp, t, "OTHERLOCZ")
            }
            temp <- temp[which(!is.na(str_match(temp, "\\bLOCZ|sub_LOCZ\\b")))]
            phrases <- c(phrases, temp)
          }
        } else {
          phrases <- c(phrases, NA)
        }
      } else {
        phrases <- c(phrases, NA)
      }
    }
    matrix <- as.data.frame(table(phrases))
    Ngrams_correct[m] <- list(matrix[order(matrix$Freq, decreasing = TRUE), ])
    return(Ngrams_correct[m])
  }
  return(Ngrams_correct)
}

# packages
# Package groups; they are attached group by group in the order listed below.
datamanagement <- c("stringr", "plyr", "tidyr", "ngram", "magrittr")
textanalysis <- c("tm")
classification <- c("randomForest", "e1071", "nnet", "caret")
plotting <- c("RgoogleMaps", "AUC", "ROCR", "caTools", "plotly", "htmltools", "shiny")
performance <- "doParallel"
readingfiles <- "readxl"
invisible(lapply(
  list(datamanagement, textanalysis, classification,
       plotting, performance, readingfiles),
  package
))
```


## Table 6  

```{r tab6, results='asis', echo=FALSE, message=FALSE,tab.cap="Replicated Table 6"}
# Table 6: first twelve rows of the correct and the incorrect n-gram tables
# computed on the 100-story sample, shown side by side.
sample100 <- read.csv("sample100.csv")
china_province <- read.csv("china_province.csv")
names <- tolower(unique(china_province$province))
sample_texts <- as.character(sample100$text)
sample_solution <- as.character(sample100$province_human)

# First 12 rows of the frequency-sorted table in slot 1, with row names reset.
top12 <- function(ngram_result) {
  top <- as.data.frame(ngram_result[[1]])[1:12, ]
  rownames(top) <- NULL
  top
}

Ngrams_correct <- corNgrams(texts = sample_texts,
                            namespace = names,
                            solution = sample_solution,
                            min = 2, max = 2)
correct <- top12(Ngrams_correct)
Ngrams_incorrect <- incNgrams(texts = sample_texts,
                              namespace = names,
                              solution = sample_solution,
                              min = 2, max = 2)
incorrect <- top12(Ngrams_incorrect)
kable(cbind(correct, incorrect), caption = "Replicated Table 6")
```


```{r fig.width=4, fig.show='hold', fig.align="center",message=FALSE, echo=FALSE,fig.cap="Table 6 from Manuscript"}
library(png)
library(grid)
library(gridExtra)
# Show the manuscript's Table 6 image below the replicated table.
tab6_raster <- as.raster(readPNG("tab6MS.png"))
img1 <- rasterGrob(tab6_raster)
grid.arrange(img1, ncol = 1)
```



Table 7 and Figure 2 are illustrative, and are not replicated.

\newpage

## Table 8 
This presents the Accuracy, Precision, and Recall Rates of the Proposed and Existing Methods, and contains some of the main results from the article.

```{r table8a, include=FALSE, echo=FALSE, message=FALSE,fig.cap="Replicated Table 8"}
## Shared helpers -------------------------------------------------------------
# The same arithmetic was previously pasted once per classifier and country.

# Rounded accuracy, precision and recall (in percent) for one classifier.
#   accuracy_runs: accuracy column of results_<country>.csv
#   predicted:     machine predictions; rows where it equals 1 are evaluated
#   original:      `original` column of <country>_results.all.csv
#   truth_y:       Y column of the buildY() table built from the human coding
# table(x)[2] is the count of the second level of x (presumably the 1s), so
# precision is that count over the first two counts and recall is that count
# over the second-level count in truth_y - exactly as before.
score_classifier <- function(accuracy_runs, predicted, original, truth_y) {
  flagged <- as.vector(table(original[which(predicted == 1)]))
  positives <- as.vector(table(truth_y))[2]
  list(
    acc = round(mean(accuracy_runs) * 100),
    pre = round(flagged[2] / (flagged[1] + flagged[2]) * 100),
    rec = round(flagged[2] / positives * 100)
  )
}

# The same three numbers for the existing coder (ICEWS / Phoenix).
#   human, coder: comma-separated location strings, one entry per story
#   truth_y:      Y column of buildY() for the human coding
#   coder_y:      Y column of buildY() for the existing coder
#   strip_spaces: remove blanks from the split labels before comparing
# Accuracy is the mean, over stories, of the share of human labels that also
# appear among the coder's labels.
score_coder <- function(human, coder, truth_y, coder_y, strip_spaces = FALSE) {
  labels_of <- function(x) {
    parts <- unique(unlist(str_split(x, ",")))
    if (strip_spaces) {
      parts <- str_replace_all(parts, " ", "")
    }
    parts
  }
  share <- vapply(seq_along(human), function(i) {
    hit <- labels_of(human[i]) %in% labels_of(coder[i])
    sum(hit) / length(hit)
  }, numeric(1))
  agreed <- as.vector(table(truth_y[which(coder_y == 1)]))
  positives <- as.vector(table(truth_y))[2]
  list(
    acc = round(mean(share) * 100),
    pre = round(agreed[2] / (agreed[1] + agreed[2]) * 100),
    rec = round(agreed[2] / positives * 100)
  )
}

## China ------------------------------------------------------------------------
china_province <- read.csv("china_province.csv")
china_province$province <- as.character(iconv(china_province$province, "ASCII", "utf-8", sub = ""))
china_province$city <- as.character(iconv(china_province$sublevel, "ASCII", "utf-8", sub = ""))
china_names <- c(as.character(unique(tolower(china_province$province))))
china_names <- china_names[order(china_names)]
china_names <- str_replace_all(china_names, " ", "")
china_data <- read.csv("china_cleantext.csv")
china_data$cleantext <- as.character(china_data$cleantext)
china_data$province_human <- as.character(china_data$province_human)
china_data$province_ICEWS <- as.character(china_data$province_ICEWS)
ground_truth.china <- buildY(texts = china_data$cleantext,
                             namespace = china_names,
                             solution = as.character(china_data$province_human),
                             text_no = china_data$story_id)
evaluate_icews.china <- buildY(texts = china_data$cleantext,
                               namespace = china_names,
                               solution = china_data$province_ICEWS,
                               text_no = china_data$story_id)
china_summary <- read.csv("results_china.csv")
china_results.all <- read.csv("china_results.all.csv")

china_nn <- score_classifier(china_summary$nnet, china_results.all$machine.predicted.nnet,
                             china_results.all$original, ground_truth.china$Y)
chinaNN_acc <- china_nn$acc
chinaNN_pre <- china_nn$pre
chinaNN_rec <- china_nn$rec
china_svm <- score_classifier(china_summary$svm, china_results.all$machine.predicted.svm,
                              china_results.all$original, ground_truth.china$Y)
chinaSVM_acc <- china_svm$acc
chinaSVM_pre <- china_svm$pre
chinaSVM_rec <- china_svm$rec
china_rf <- score_classifier(china_summary$rf, china_results.all$machine.predicted.rf,
                             china_results.all$original, ground_truth.china$Y)
chinaRF_acc <- china_rf$acc
chinaRF_pre <- china_rf$pre
chinaRF_rec <- china_rf$rec
china_ext <- score_coder(china_data$province_human, china_data$province_ICEWS,
                         ground_truth.china$Y, evaluate_icews.china$Y)
chinaEXT_acc <- china_ext$acc
chinaEXT_pre <- china_ext$pre
chinaEXT_rec <- china_ext$rec

## DRC --------------------------------------------------------------------------
drc_province <- read.csv("drc_province.csv")
drc_province$city <- as.character(drc_province$city)
drc_province$province <- as.character(drc_province$province)
drc_names <- unique(drc_province$province)
drc_data <- read.csv("drc_cleantext.csv")
drc_data$cleantext <- as.character(unlist(drc_data$cleantext))
drc_data$cleantext <- str_replace_all(drc_data$cleantext, "NONAVERB", "NONTOPIC")
drc_data$province_human <- as.character(unlist(drc_data$province_human))
drc_data$province_human <- str_trim(drc_data$province_human, side = "both")
drc_data$province_ICEWS <- as.character(unlist(drc_data$province_ICEWS))
ground_truth.drc <- buildY(texts = drc_data$cleantext,
                           namespace = drc_names,
                           solution = as.character(drc_data$province_human),
                           text_no = drc_data$story_id)
evaluate_icews.drc <- buildY(texts = drc_data$cleantext,
                             namespace = drc_names,
                             solution = drc_data$province_ICEWS,
                             text_no = drc_data$story_id)
drc_summary <- read.csv("results_drc.csv")
drc_results.all <- read.csv("drc_results.all.csv")

drc_nn <- score_classifier(drc_summary$nnet, drc_results.all$machine.predicted.nnet,
                           drc_results.all$original, ground_truth.drc$Y)
drcNN_acc <- drc_nn$acc
drcNN_pre <- drc_nn$pre
drcNN_rec <- drc_nn$rec
drc_svm <- score_classifier(drc_summary$svm, drc_results.all$machine.predicted.svm,
                            drc_results.all$original, ground_truth.drc$Y)
drcSVM_acc <- drc_svm$acc
drcSVM_pre <- drc_svm$pre
drcSVM_rec <- drc_svm$rec
drc_rf <- score_classifier(drc_summary$rf, drc_results.all$machine.predicted.rf,
                           drc_results.all$original, ground_truth.drc$Y)
drcRF_acc <- drc_rf$acc
drcRF_pre <- drc_rf$pre
drcRF_rec <- drc_rf$rec
# Blanks are removed from the split labels before comparing (DRC and Syria only).
drc_ext <- score_coder(drc_data$province_human, drc_data$province_ICEWS,
                       ground_truth.drc$Y, evaluate_icews.drc$Y,
                       strip_spaces = TRUE)
drcEXT_acc <- drc_ext$acc
drcEXT_pre <- drc_ext$pre
drcEXT_rec <- drc_ext$rec

## Syria ------------------------------------------------------------------------
syria_province <- read.csv("syria_province.csv")
syria_names <- c(str_trim(as.character(tolower(syria_province$governorate))))
syria_names <- unique(syria_names[order(syria_names)])
syria_data <- read.csv("syria_cleantext.csv")
syria_data$cleantext <- as.character(unlist(syria_data$cleantext))
syria_data$province_human <- as.character(unlist(syria_data$province_human))
# Character copy of the existing coder's column; used for both the score and
# the buildY() evaluation below.
syria_data$province_OEDA <- as.character(unlist(syria_data$province_oeda))
ground_truth.syria <- buildY(texts = syria_data$cleantext,
                             namespace = syria_names,
                             solution = as.character(syria_data$province_human),
                             text_no = syria_data$story_id)
evaluate_phoenix.syria <- buildY(texts = syria_data$cleantext,
                                 namespace = syria_names,
                                 solution = syria_data$province_OEDA,
                                 text_no = syria_data$story_id)
syria_summary <- read.csv("results_syria.csv")
syria_results.all <- read.csv("syria_results.all.csv")

syria_nn <- score_classifier(syria_summary$nnet, syria_results.all$machine.predicted.nnet,
                             syria_results.all$original, ground_truth.syria$Y)
syriaNN_acc <- syria_nn$acc
syriaNN_pre <- syria_nn$pre
syriaNN_rec <- syria_nn$rec
syria_svm <- score_classifier(syria_summary$svm, syria_results.all$machine.predicted.svm,
                              syria_results.all$original, ground_truth.syria$Y)
syriaSVM_acc <- syria_svm$acc
syriaSVM_pre <- syria_svm$pre
syriaSVM_rec <- syria_svm$rec
syria_rf <- score_classifier(syria_summary$rf, syria_results.all$machine.predicted.rf,
                             syria_results.all$original, ground_truth.syria$Y)
syriaRF_acc <- syria_rf$acc
syriaRF_pre <- syria_rf$pre
syriaRF_rec <- syria_rf$rec
syria_ext <- score_coder(syria_data$province_human, syria_data$province_OEDA,
                         ground_truth.syria$Y, evaluate_phoenix.syria$Y,
                         strip_spaces = TRUE)
syriaEXT_acc <- syria_ext$acc
syriaEXT_pre <- syria_ext$pre
syriaEXT_rec <- syria_ext$rec

## Colombia ---------------------------------------------------------------------
# Two long province names are respelled everywhere (texts, codings, namespace);
# presumably so that shorter names contained in them ("santander", "cauca") no
# longer match inside the longer ones - TODO confirm against the province list.
respell_long_names <- function(x) {
  x <- str_replace_all(x, "nortedesantander", "nortedesaantander")
  str_replace_all(x, "valledelcauca", "valledelcaauca")
}
colombia_province <- read_excel("colombia_province.xlsx")
colombia_names <- as.character(unique(colombia_province$province_nospacing))
colombia_names <- colombia_names[order(nchar(colombia_names), decreasing = TRUE)]
colombia_names <- respell_long_names(colombia_names)
colombia_data <- read.csv("colombia_cleantext.csv")
colombia_data$cleantext <- respell_long_names(as.character(colombia_data$cleantext))
colombia_data$province_human <- respell_long_names(as.character(colombia_data$province_human))
colombia_data$province_ICEWS <- respell_long_names(as.character(colombia_data$province_ICEWS))
ground_truth.colombia <- buildY(texts = colombia_data$cleantext,
                                namespace = colombia_names,
                                solution = as.character(colombia_data$province_human),
                                text_no = colombia_data$story_id)
evaluate_icews.colombia <- buildY(texts = colombia_data$cleantext,
                                  namespace = colombia_names,
                                  solution = colombia_data$province_ICEWS,
                                  text_no = colombia_data$story_id)
colombia_summary <- read.csv("results_colombia.csv")
colombia_results.all <- read.csv("colombia_results.all.csv")

col_nn <- score_classifier(colombia_summary$nnet, colombia_results.all$machine.predicted.nnet,
                           colombia_results.all$original, ground_truth.colombia$Y)
colNN_acc <- col_nn$acc
colNN_pre <- col_nn$pre
colNN_rec <- col_nn$rec
col_svm <- score_classifier(colombia_summary$svm, colombia_results.all$machine.predicted.svm,
                            colombia_results.all$original, ground_truth.colombia$Y)
colSVM_acc <- col_svm$acc
colSVM_pre <- col_svm$pre
colSVM_rec <- col_svm$rec
col_rf <- score_classifier(colombia_summary$rf, colombia_results.all$machine.predicted.rf,
                           colombia_results.all$original, ground_truth.colombia$Y)
colRF_acc <- col_rf$acc
colRF_pre <- col_rf$pre
colRF_rec <- col_rf$rec
col_ext <- score_coder(colombia_data$province_human, colombia_data$province_ICEWS,
                       ground_truth.colombia$Y, evaluate_icews.colombia$Y)
colEXT_acc <- col_ext$acc
colEXT_pre <- col_ext$pre
colEXT_rec <- col_ext$rec
```

```{r table8, results='asis', echo=FALSE, message=FALSE}
##### Table ######
# One three-row block (accuracy, precision, recall) for a country; every
# argument is the c(acc, pre, rec) triple of one method.
metric_block <- function(nn, svm, rf, ext) {
  data.frame(
    Results = c("Accuracy", "Precision", "Recall"),
    NNs = nn,
    SVM = svm,
    RF = rf,
    ExistingCoders = ext
  )
}
dt_china <- metric_block(
  nn = c(chinaNN_acc, chinaNN_pre, chinaNN_rec),
  svm = c(chinaSVM_acc, chinaSVM_pre, chinaSVM_rec),
  rf = c(chinaRF_acc, chinaRF_pre, chinaRF_rec),
  ext = c(chinaEXT_acc, chinaEXT_pre, chinaEXT_rec)
)
dt_drc <- metric_block(
  nn = c(drcNN_acc, drcNN_pre, drcNN_rec),
  svm = c(drcSVM_acc, drcSVM_pre, drcSVM_rec),
  rf = c(drcRF_acc, drcRF_pre, drcRF_rec),
  ext = c(drcEXT_acc, drcEXT_pre, drcEXT_rec)
)
dt_syria <- metric_block(
  nn = c(syriaNN_acc, syriaNN_pre, syriaNN_rec),
  svm = c(syriaSVM_acc, syriaSVM_pre, syriaSVM_rec),
  rf = c(syriaRF_acc, syriaRF_pre, syriaRF_rec),
  ext = c(syriaEXT_acc, syriaEXT_pre, syriaEXT_rec)
)
dt_col <- metric_block(
  nn = c(colNN_acc, colNN_pre, colNN_rec),
  svm = c(colSVM_acc, colSVM_pre, colSVM_rec),
  rf = c(colRF_acc, colRF_pre, colRF_rec),
  ext = c(colEXT_acc, colEXT_pre, colEXT_rec)
)
dt <- rbind(dt_china, dt_drc, dt_syria, dt_col)
# The country label sits on the middle row of each three-row block.
Country <- c(
  "", "China (ICEWS, Protest)", "",
  "", "D.R.C. (ICEWS, Fight)", "",
  "", "Syria (Phoenix, Fight)", "",
  "", "Colombia (ICEWS, Protest)", ""
)
dt <- cbind(Country, dt)
colnames(dt)[1] <- ""
kable(dt, caption = "Replicated Table 8")
```

```{r fig.width=4, fig.show='hold', fig.align="center",message=FALSE, echo=FALSE}
library(png)
library(grid)
library(gridExtra)
# Show the manuscript's Table 8 image below the replicated table.
tab8_raster <- as.raster(readPNG("tab8MS.png"))
img1 <- rasterGrob(tab8_raster, interpolate = TRUE)
grid.arrange(img1, ncol = 1)
```

## Figure 3 
The ROC Curves and Precision & Recall Curves are presented for China, the DRC (Congo), Syria, and Colombia.

```{r fig3, fig.width=11, fig.height=6.5, results='asis', echo=FALSE, message=FALSE,fig.cap="Replicated Figure 3"}
# Figure 3: top row ROC curves, bottom row precision-recall curves, one column
# per country. The eight panels used to be eight pasted copies of the same
# plotting code; they are now drawn by two helpers.
par(mfrow = c(2, 4))

# One ROC panel: neural net drawn by plot(), SVM and random forest overlaid.
#   nnet, svm, rf: predicted probabilities, passed exactly as the caller
#                  prepared them (some columns go through
#                  as.numeric(as.character()) and some do not, as before)
#   truth:         observed labels as a factor
roc_panel <- function(title, nnet, svm, rf, truth) {
  plot(roc(nnet, truth), lty = 1, main = title, # family="Garamond",
       cex.axis = 1, cex.lab = 1.5, cex.main = 2, lwd = 2)
  ob.svm <- roc(svm, truth)
  lines(ob.svm$fpr, ob.svm$tpr, lty = 2, col = 'red', lwd = 2)
  ob.rf <- roc(rf, truth)
  lines(ob.rf$fpr, ob.rf$tpr, lty = 3, col = 'blue', lwd = 2)
  legend(0.65, 0.3, legend = c("NNet", "SVM", "R.For"), lty = 1:3,
         col = c('black', 'red', 'blue'))
}

# One precision-recall panel: three classifiers plus the existing coder.
#   results:     <country>_results.all data.frame
#   coder_y:     Y column of buildY() for the existing coder
#   truth_y:     Y column of buildY() for the human coding
#   coder_label: legend entry for the existing coder
#   legend_x:    x position of the legend
pr_panel <- function(title, results, coder_y, truth_y, coder_label,
                     legend_x = 0.6) {
  observed <- as.numeric(unlist(results$original))
  pr_nnet <- rocdf(as.numeric(unlist(results$machine.probabilities.nnet)),
                   observed, type = "pr")
  pr_svm <- rocdf(as.numeric(unlist(results$machine.probabilities.svm)),
                  observed, type = "pr")
  pr_rf <- rocdf(as.numeric(unlist(results$machine.probabilities.rf)),
                 observed, type = "pr")
  pr_coder <- rocdf(as.numeric(unlist(coder_y)),
                    as.numeric(unlist(truth_y)), type = "pr")
  plot(pr_nnet[, 1], pr_nnet[, 2], type = "l", lty = 1, col = "black", lwd = 2,
       main = title, # family="Garamond",
       cex.axis = 1, cex.lab = 1.5, cex.main = 2,
       xlab = "Recall", ylab = "Precision")
  lines(pr_svm[, 1], pr_svm[, 2], type = "l", lty = 2, col = 'red', lwd = 2)
  lines(pr_rf[, 1], pr_rf[, 2], type = "l", lty = 3, col = 'blue', lwd = 2)
  lines(pr_coder[, 1], pr_coder[, 2], type = "l", lty = 4, col = "burlywood4", lwd = 2)
  legend(legend_x, 0.3, legend = c("NNet", "SVM", "R.For", coder_label),
         lty = 1:4, col = c('black', 'red', 'blue', "burlywood4"))
}

# ROC row
roc_panel("China",
          nnet = china_results.all$machine.probabilities.nnet,
          svm = as.numeric(as.character(china_results.all$machine.probabilities.svm)),
          rf = as.numeric(as.character(china_results.all$machine.probabilities.rf)),
          truth = as.factor(china_results.all$original))
roc_panel("Congo (DRC)",
          nnet = drc_results.all$machine.probabilities.nnet,
          svm = drc_results.all$machine.probabilities.svm,
          rf = as.numeric(as.character(drc_results.all$machine.probabilities.rf)),
          truth = as.factor(drc_results.all$original))
roc_panel("Syria",
          nnet = syria_results.all$machine.probabilities.nnet,
          svm = as.numeric(as.character(syria_results.all$machine.probabilities.svm)),
          rf = as.numeric(as.character(syria_results.all$machine.probabilities.rf)),
          truth = as.factor(syria_results.all$original))
roc_panel("Colombia",
          nnet = colombia_results.all$machine.probabilities.nnet,
          svm = colombia_results.all$machine.probabilities.svm,
          rf = as.numeric(as.character(colombia_results.all$machine.probabilities.rf)),
          truth = as.factor(colombia_results.all$original))

# Precision and recall row
pr_panel("China", china_results.all,
         coder_y = evaluate_icews.china$Y, truth_y = ground_truth.china$Y,
         coder_label = "ICEWS")
pr_panel("Congo (DRC)", drc_results.all,
         coder_y = evaluate_icews.drc$Y, truth_y = ground_truth.drc$Y,
         coder_label = "ICEWS")
pr_panel("Syria", syria_results.all,
         coder_y = evaluate_phoenix.syria$Y, truth_y = ground_truth.syria$Y,
         coder_label = "Phoenix", legend_x = 0.58)
pr_panel("Colombia", colombia_results.all,
         coder_y = evaluate_icews.colombia$Y, truth_y = ground_truth.colombia$Y,
         coder_label = "ICEWS")
```

```{r fig.align="left",message=FALSE, echo=FALSE}
library(png)
library(grid)
library(gridExtra)
# Show the manuscript's Figure 3 image below the replicated figure.
fig3_raster <- as.raster(readPNG("fig3MS.png"))
img1 <- rasterGrob(fig3_raster, interpolate = TRUE)
grid.arrange(img1, ncol = 1)
```



Figure 4 is not replicated here. See plotly_static.R should you wish to replicate a static version of these plots.

## Conclusion

Please report any problems or address any questions to the corresponding authors: Sophie J. Lee (sophie.jiseon.lee@gmail.com),
Howard Liu (hao.liu@duke.edu), and
Michael D. Ward (michael.don.ward@gmail.com).
